## load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## read in data
births = read.csv("data/Yr1116Birth.csv", na.strings = "9999")
deaths = read.csv("data/Yr1116Death.csv")
## rewrite NAs
births$SEX[which(births$SEX == 9)] = NA
births$CIGPN[which(births$CIGPN == 99)] = NA
births$CIGFN[which(births$CIGFN == 99)] = NA
births$CIGSN[which(births$CIGSN == 99)] = NA
births$CIGLN[which(births$CIGLN == 99)] = NA
births$PARITY[which(births$PARITY == 99)] = NA
births$PLUR[which(births$PLUR == 99)] = NA
births$GEST[which(births$GEST == 99)] = NA
births$MAGE[which(births$MAGE == 99)] = NA
summary(births$BWTG)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4 2948 3310 3258 3640 6294 430
sd(births$BWTG, na.rm = TRUE)
## [1] 618.8703
ggplot(births, aes(x= BWTG))+
geom_histogram() +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 430 rows containing non-finite values (stat_bin).
Birthweight is close to normally distrubted, with a slight left skew, centered around ~3300g with a standard deviation of 600g. There appear to be no large outliers in terms of birthweight. 430 birth weights are missing
births = births %>% mutate(
CIGPN_binary = CIGPN>0,
CIGFN_binary = CIGFN>0,
CIGSN_binary = CIGSN>0,
CIGLN_binary = CIGLN>0
)
births = births %>%
mutate(total_smoked = CIGFN+CIGSN+CIGLN) %>%
mutate(smoked_during = total_smoked >0)
mean(births$smoked_during,na.rm=TRUE)
## [1] 0.100175
mean(births$CIGPN_binary, na.rm = TRUE)
## [1] 0.1340957
mean(births$CIGFN_binar, na.rm = TRUE)
## [1] 0.09676666
mean(births$CIGSN_binary,na.rm = TRUE)
## [1] 0.08199303
mean(births$CIGLN_binary, na.rm = TRUE)
## [1] 0.07788803
ggplot(births %>% filter(smoked_during), aes(x = total_smoked))+
geom_histogram() +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mean(births %>% filter(smoked_during) %>% select(total_smoked) %>% unlist())
## [1] 23.06795
ggplot(births, aes(x= smoked_during, y = BWTG)) +
geom_boxplot()+
theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
lm_smoking_v_non = lm(data=births, BWTG~smoked_during)
summary(lm_smoking_v_non)
##
## Call:
## lm(formula = BWTG ~ smoked_during, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3276.1 -304.1 40.2 375.9 3012.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3281.064 0.764 4294.52 <2e-16 ***
## smoked_duringTRUE -231.237 2.414 -95.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 614.5 on 718931 degrees of freedom
## (2759 observations deleted due to missingness)
## Multiple R-squared: 0.0126, Adjusted R-squared: 0.0126
## F-statistic: 9175 on 1 and 718931 DF, p-value: < 2.2e-16
births = births %>%
mutate(smoking_type = ifelse(smoked_during,
ifelse(CIGPN_binary, "before and during", "during only"),
ifelse(CIGPN_binary, "before only", "none"))) %>%
mutate(smoking_type = relevel(as.factor(smoking_type), ref = 4))
ggplot(births, aes(x= smoking_type, y = BWTG)) +
geom_boxplot()+
theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
lm_smoking = lm(data=births, BWTG~smoking_type)
summary(lm_smoking)
##
## Call:
## lm(formula = BWTG ~ smoking_type, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3276.6 -304.6 40.5 375.4 3012.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3281.6171 0.7802 4205.990 < 2e-16 ***
## smoking_typebefore and during -232.0954 2.4538 -94.586 < 2e-16 ***
## smoking_typebefore only -13.4168 3.8483 -3.486 0.00049 ***
## smoking_typeduring only -223.2977 13.0480 -17.114 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 614.5 on 718904 degrees of freedom
## (2784 observations deleted due to missingness)
## Multiple R-squared: 0.01262, Adjusted R-squared: 0.01262
## F-statistic: 3063 on 3 and 718904 DF, p-value: < 2.2e-16
Around 13% of women smoked in the three months leading up to pregnancy and around 10% of women at any point during their pregnancy. Among those who did smoke during pregnance, the average number of cigarettes smoked during pregnancy was 23. The birthweight of children of smokers was significantly lower than that of the children of nonsmokers, with an average difference of 231 grams. There is also a significant relationship between birthweight and smoking before pregnancy, even for those who did not smoke during pregnancy.
# Check the parity frequencies
summary(births$PARITY)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.465 3.000 25.000 860
births$PARITY = as.numeric(births$PARITY)
births = births %>%
mutate(PARITY_truncated = ifelse(
PARITY > 4, "5+", PARITY)
)
births$PARITY = as.factor(births$PARITY)
ggplot(data = births, mapping = aes(x = PARITY, y = BWTG)) +
geom_boxplot() + xlab("Parity") + ylab("Birth weight (g)") +
ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
ggplot(births, aes(x = PARITY)) +
stat_count()
# Model without any change, significant up to the low teens
parity_reg = lm(BWTG ~ PARITY, births)
summary(parity_reg)
##
## Call:
## lm(formula = BWTG ~ PARITY, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3272.9 -303.2 50.9 382.5 3126.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3223.242 1.259 2559.590 < 2e-16 ***
## PARITY2 70.877 1.858 38.155 < 2e-16 ***
## PARITY3 63.680 2.124 29.976 < 2e-16 ***
## PARITY4 40.306 2.645 15.241 < 2e-16 ***
## PARITY5 11.815 3.451 3.424 0.000618 ***
## PARITY6 -11.166 4.751 -2.350 0.018757 *
## PARITY7 -32.906 6.538 -5.033 4.84e-07 ***
## PARITY8 -76.858 9.209 -8.345 < 2e-16 ***
## PARITY9 -70.364 12.447 -5.653 1.58e-08 ***
## PARITY10 -85.904 17.290 -4.968 6.75e-07 ***
## PARITY11 -126.567 22.324 -5.670 1.43e-08 ***
## PARITY12 -41.074 30.094 -1.365 0.172306
## PARITY13 -140.997 37.611 -3.749 0.000178 ***
## PARITY14 -157.146 55.261 -2.844 0.004459 **
## PARITY15 -20.675 62.728 -0.330 0.741706
## PARITY16 -158.545 82.550 -1.921 0.054782 .
## PARITY17 -64.515 107.530 -0.600 0.548530
## PARITY18 -61.242 145.592 -0.421 0.674019
## PARITY19 62.758 165.085 0.380 0.703829
## PARITY20 250.258 195.329 1.281 0.200120
## PARITY21 112.901 233.462 0.484 0.628673
## PARITY22 -80.242 356.616 -0.225 0.821972
## PARITY23 -530.242 617.674 -0.858 0.390645
## PARITY25 162.758 218.384 0.745 0.456100
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 617.7 on 720622 degrees of freedom
## (1046 observations deleted due to missingness)
## Multiple R-squared: 0.003304, Adjusted R-squared: 0.003272
## F-statistic: 103.9 on 23 and 720622 DF, p-value: < 2.2e-16
# Rerun EDA with truncated data
births$PARITY_truncated = as.factor(births$PARITY_truncated)
ggplot(births, aes(x = PARITY_truncated)) +
stat_count()
ggplot(data = births, mapping = aes(x = PARITY_truncated, y = BWTG)) +
geom_boxplot() + xlab("Parity Truncated") + ylab("Birth weight (g)") +
ggtitle("NC Births, 2011-2016")
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
parity_truncated_reg = lm(BWTG ~ PARITY_truncated, births)
summary(parity_truncated_reg)
##
## Call:
## lm(formula = BWTG ~ PARITY_truncated, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3272.9 -303.2 48.7 383.1 3061.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3223.242 1.259 2559.251 < 2e-16 ***
## PARITY_truncated2 70.877 1.858 38.150 < 2e-16 ***
## PARITY_truncated3 63.680 2.125 29.972 < 2e-16 ***
## PARITY_truncated4 40.306 2.645 15.239 < 2e-16 ***
## PARITY_truncated5+ -11.939 2.589 -4.612 3.99e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 617.8 on 720641 degrees of freedom
## (1046 observations deleted due to missingness)
## Multiple R-squared: 0.003014, Adjusted R-squared: 0.003008
## F-statistic: 544.6 on 4 and 720641 DF, p-value: < 2.2e-16
Independent of other variables, we see a negative relationship between parity and birth weight past the first child. The frequency of parity decreases in an exponential fashion. A second variable was created that truncates parities of at least five to improve interprability and prevent overfitting. The quantity of missing data is relatively small.
# Check the plurality frequencies
births$PLUR = as.numeric(births$PLUR)
births = births %>%
mutate(PLUR_truncated = ifelse(
PLUR > 2, "3+", PLUR)
)
births$PLUR = as.factor(births$PLUR)
ggplot(data = births, mapping = aes(x = PLUR, y = BWTG)) +
geom_boxplot() + xlab("Plurality") + ylab("Birth weight (g)") +
ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
ggplot(births, aes(x = PLUR)) +
stat_count()
plurality_reg = lm(BWTG ~ PLUR, births)
summary(plurality_reg)
##
## Call:
## lm(formula = BWTG ~ PLUR, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3287.1 -302.1 28.9 364.9 3001.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3292.1374 0.7085 4646.93 <2e-16 ***
## PLUR2 -969.8739 3.8495 -251.95 <2e-16 ***
## PLUR3 -1632.6058 21.4519 -76.11 <2e-16 ***
## PLUR4 -1767.3874 132.1681 -13.37 <2e-16 ***
## PLUR5 -1899.7374 186.9125 -10.16 <2e-16 ***
## PLUR6 -2999.9707 241.3023 -12.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 591.1 on 721251 degrees of freedom
## (435 observations deleted due to missingness)
## Multiple R-squared: 0.08785, Adjusted R-squared: 0.08784
## F-statistic: 1.389e+04 on 5 and 721251 DF, p-value: < 2.2e-16
births$PLUR_truncated = as.factor(births$PLUR_truncated)
ggplot(data = births, mapping = aes(x = PLUR_truncated, y = BWTG)) +
geom_boxplot() + xlab("Plurality Truncated") + ylab("Birth weight (g)") +
ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
ggplot(births, aes(x = PLUR_truncated)) +
stat_count()
plurality_reg_truncated = lm(BWTG ~ PLUR_truncated, births)
summary(plurality_reg_truncated)
##
## Call:
## lm(formula = BWTG ~ PLUR_truncated, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3287.1 -302.1 28.9 364.9 3001.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3292.1374 0.7085 4646.8 <2e-16 ***
## PLUR_truncated2 -969.8739 3.8496 -251.9 <2e-16 ***
## PLUR_truncated3+ -1649.6550 20.9622 -78.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 591.1 on 721254 degrees of freedom
## (435 observations deleted due to missingness)
## Multiple R-squared: 0.0878, Adjusted R-squared: 0.0878
## F-statistic: 3.471e+04 on 2 and 721254 DF, p-value: < 2.2e-16
We see a strong non linear negative relationship between plurality and birth weight. The frequency of pluralities above two is extremely small, and we again see a proportionally small amount of missing data. A second variable was created that truncates pluralities of at least three to improve interprability and prevent overfitting
summary(births$GEST)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.0 38.0 39.0 38.5 40.0 83.0 539
ggplot(data = births, mapping = aes(x = as.factor(GEST), y = BWTG)) +
xlab("Gestation Period (weeks)") +
ylab("Birth weight (g)") +
geom_boxplot() +
ggtitle("NC Births, 2011-2016") +
theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).
ggplot(births, aes(x = GEST)) +
stat_count()
## Warning: Removed 539 rows containing non-finite values (stat_count).
gest_reg = lm(BWTG ~ GEST, births)
summary(gest_reg)
##
## Call:
## lm(formula = BWTG ~ GEST, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9797.7 -299.6 -20.5 276.2 4752.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3898.2669 8.8696 -439.5 <2e-16 ***
## GEST 185.8433 0.2299 808.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 448.1 on 720801 degrees of freedom
## (889 observations deleted due to missingness)
## Multiple R-squared: 0.4754, Adjusted R-squared: 0.4754
## F-statistic: 6.532e+05 on 1 and 720801 DF, p-value: < 2.2e-16
max_gest = max(births$GEST, na.rm=TRUE)
births = births[which(births$GEST != max_gest), ]
There appears to be a non linear positive relationship between gestation period and birth weight. The mean gestational period is approximately 38.5 weeks and the period with the highest median weight is 42 weeks. The frequency distribution is left skewed with the majority of babies having a gestational period between 38 and 40 weeks. There is some concern that more extreme gestational periods may lead to higher variance, and it should be noted that there is a chunk of data points with gestational periods of 17 to 21 weeks that have much higher than expected birth weights. There is one outlier at gestation period of 83. We will assume that this data point was recorded incorrectly and will drop the observation from the dataset.
ggplot(data = births, mapping = aes(x = MAGE, y = BWTG)) +
xlab("Age of Mother (years)") +
ylab("Birth weight (g)") +
geom_smooth(method='lm', na.rm = TRUE) +
geom_point() +
ggtitle("NC Births, 2011-2016") +
theme_minimal()
## Warning: Removed 376 rows containing missing values (geom_point).
ggplot(births, aes(x = MAGE)) +
stat_count() +
theme_minimal()
## Warning: Removed 26 rows containing non-finite values (stat_count).
mage_reg = lm(BWTG ~ MAGE, births)
summary(mage_reg)
##
## Call:
## lm(formula = BWTG ~ MAGE, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3330.1 -302.4 49.5 383.3 3053.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3065.1300 3.4962 876.70 <2e-16 ***
## MAGE 6.9228 0.1229 56.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 617.3 on 720774 degrees of freedom
## (376 observations deleted due to missingness)
## Multiple R-squared: 0.004379, Adjusted R-squared: 0.004378
## F-statistic: 3170 on 1 and 720774 DF, p-value: < 2.2e-16
Mother’s age seems to be fairly normally distributed with a mean of 27.8. There appears to be a positive relationship between the age of the mother and the birth weight. There is no evidence to suggest that the birth weight variance is not constant across the mother’s age.
births$MRACER = as.factor(births$MRACER)
ggplot(data = births, mapping = aes(x = MRACER, y = BWTG)) +
xlab("Race of Mother") +
ylab("Birth weight (g)") +
geom_boxplot() +
ggtitle("NC Births, 2011-2016") +
theme_minimal()
## Warning: Removed 350 rows containing non-finite values (stat_boxplot).
bw_race_avgs = births %>%
group_by(MRACER) %>%
summarize(mweight = mean(BWTG, na.rm = TRUE), freq_percent = n()/nrow(births))
bw_race_avgs
## # A tibble: 9 x 3
## MRACER mweight freq_percent
## <fct> <dbl> <dbl>
## 1 0 3292. 0.117
## 2 1 3335. 0.586
## 3 2 3070. 0.243
## 4 3 3197. 0.0141
## 5 4 3285. 0.00463
## 6 5 3193. 0.000807
## 7 6 3148 0.000132
## 8 7 3218. 0.00294
## 9 8 3169. 0.0322
ggplot(births, aes(x = MRACER)) +
stat_count()
mage_reg = lm(BWTG ~ MRACER, births)
summary(mage_reg)
##
## Call:
## lm(formula = BWTG ~ MRACER, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3331.2 -301.2 48.3 378.8 3100.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3291.761 2.095 1571.537 < 2e-16 ***
## MRACER1 43.452 2.294 18.938 < 2e-16 ***
## MRACER2 -222.103 2.550 -87.091 < 2e-16 ***
## MRACER3 -95.067 6.395 -14.867 < 2e-16 ***
## MRACER4 -7.259 10.737 -0.676 0.498996
## MRACER5 -98.311 25.306 -3.885 0.000102 ***
## MRACER6 -143.761 62.455 -2.302 0.021345 *
## MRACER7 -73.905 13.372 -5.527 3.26e-08 ***
## MRACER8 -122.548 4.510 -27.172 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 608.4 on 720793 degrees of freedom
## (350 observations deleted due to missingness)
## Multiple R-squared: 0.0328, Adjusted R-squared: 0.03279
## F-statistic: 3055 on 8 and 720793 DF, p-value: < 2.2e-16
0 - non-white,, 1 = white, 2 black, 3 indian, 8 other asian
There are significant differences between the average birth weights of mother’s of different races. We see that mother’s that self identified as white have the largest mean baby weight at 3.33 kg, while black mother’s have the lowest mean baby weight at only 3.07 kg. 58 percent of mother’s identify as white, 24 percent identify as black, 12 percent identify as non-white, and 3 percent identify as other asian.
#calculate infant mortality by county, to use as a proxy for socioconomic status
deaths_by_county = deaths %>%
group_by(cores) %>%
summarize(n_deaths = n()) %>%
rename(CORES=cores)
births_by_county = births %>%
group_by(CORES) %>%
summarize(n_births = n())
infant_mortality = merge(deaths_by_county, births_by_county, by = "CORES") %>%
mutate(mortality = n_deaths/n_births*100) %>% #note: mortality rate is as percentage
select(-n_births, -n_deaths)
births = merge(births, infant_mortality, by = "CORES")
summary(births$mortality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1167 0.6152 0.7013 0.7208 0.7965 1.7606
ggplot(births, aes(x= mortality))+
geom_histogram() +
xlab("Mortality Rate (%)")+
theme_minimal() +
ggtitle("Histogram of Infant Mortality Rate by County")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
bwtg_vs_mortality = lm(data = births, BWTG~mortality)
summary(bwtg_vs_mortality)
##
## Call:
## lm(formula = BWTG ~ mortality, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3287.0 -303.7 45.5 382.8 3026.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3370.715 3.186 1057.90 <2e-16 ***
## mortality -156.823 4.304 -36.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 618.1 on 720800 degrees of freedom
## (350 observations deleted due to missingness)
## Multiple R-squared: 0.001839, Adjusted R-squared: 0.001837
## F-statistic: 1328 on 1 and 720800 DF, p-value: < 2.2e-16
ggplot(births, aes(x = mortality, y = BWTG))+
geom_point() +
geom_abline(slope = bwtg_vs_mortality$coefficients[[2]], intercept = bwtg_vs_mortality$coefficients[[1]])+
theme_minimal()
## Warning: Removed 350 rows containing missing values (geom_point).
We chose to use infant mortality rate of birth county as a proxy for socioeconomic status, calculated as number of deaths before the age of 1 divided by total number of births in a county. The median county in the data had a infant mortality rate of 0.7%, with the range of infant mortality rates in our dataset ranging from 0.12% to 1.76%. Infant mortality rate of birth county and birth weight appear to have a weak negative linear relationship, and a 1 percentage point increase in infant mortality rate is associated with a 157g decrease in expected birth weight.
births_excl = na.omit(births)
model1 = lm(data = births_excl, BWTG ~ GEST + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model1)
##
## Call:
## lm(formula = BWTG ~ GEST + PARITY_truncated + PLUR + smoking_type +
## MAGE + MRACER + mortality, data = births_excl, na.action = "na.exclude")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3406.6 -282.9 -19.5 260.9 4754.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3691.8459 9.9298 -371.795 < 2e-16 ***
## GEST 175.2110 0.2336 749.912 < 2e-16 ***
## PARITY_truncated2 94.7003 1.3162 71.951 < 2e-16 ***
## PARITY_truncated3 113.4122 1.5380 73.742 < 2e-16 ***
## PARITY_truncated4 117.1798 1.9259 60.844 < 2e-16 ***
## PARITY_truncated5+ 117.7761 1.9533 60.297 < 2e-16 ***
## PLUR2 -360.8848 2.9313 -123.116 < 2e-16 ***
## PLUR3 -459.6481 15.7032 -29.271 < 2e-16 ***
## PLUR4 -449.5572 95.9190 -4.687 2.78e-06 ***
## PLUR5 -249.7235 191.8140 -1.302 0.193
## PLUR6 -39.7109 175.1356 -0.227 0.821
## smoking_typebefore and during -204.8179 1.7697 -115.734 < 2e-16 ***
## smoking_typebefore only -11.4762 2.7026 -4.246 2.17e-05 ***
## smoking_typeduring only -165.3536 9.1293 -18.112 < 2e-16 ***
## MAGE 4.5897 0.0964 47.610 < 2e-16 ***
## MRACER1 83.5047 1.6596 50.318 < 2e-16 ***
## MRACER2 -99.3192 1.8206 -54.552 < 2e-16 ***
## MRACER3 -4.8482 4.5756 -1.060 0.289
## MRACER4 -33.5537 7.6016 -4.414 1.01e-05 ***
## MRACER5 -114.3439 17.9405 -6.373 1.85e-10 ***
## MRACER6 24.2038 44.0274 0.550 0.582
## MRACER7 -14.8690 9.4582 -1.572 0.116
## MRACER8 -102.9010 3.2079 -32.077 < 2e-16 ***
## mortality 23.2031 3.0989 7.487 7.03e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 428.9 on 717845 degrees of freedom
## Multiple R-squared: 0.5184, Adjusted R-squared: 0.5184
## F-statistic: 3.36e+04 on 23 and 717845 DF, p-value: < 2.2e-16
plot(model1$fitted.values, model1$residuals)
plot(births_excl$GEST, model1$residuals)
plot(births_excl$PLUR, model1$residuals)
plot(births_excl$PARITY_truncated, model1$residuals)
plot(births_excl$smoking_type, model1$residuals)
plot(births_excl$MRACER, model1$residuals)
plot(births_excl$MAGE, model1$residuals)
births_excl = births_excl %>%
mutate(GEST2 = GEST^2, GEST3 = GEST ^ 3)
model2 = lm(data = births_excl, BWTG ~ GEST + GEST2 + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model2)
##
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + PARITY_truncated + PLUR +
## smoking_type + MAGE + MRACER + mortality, data = births_excl,
## na.action = "na.exclude")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3412.1 -283.0 -19.5 260.7 4710.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.941e+03 4.049e+01 -72.632 < 2e-16 ***
## GEST 1.309e+02 2.328e+00 56.214 < 2e-16 ***
## GEST2 6.406e-01 3.348e-02 19.136 < 2e-16 ***
## PARITY_truncated2 9.592e+01 1.317e+00 72.814 < 2e-16 ***
## PARITY_truncated3 1.149e+02 1.540e+00 74.645 < 2e-16 ***
## PARITY_truncated4 1.190e+02 1.928e+00 61.733 < 2e-16 ***
## PARITY_truncated5+ 1.198e+02 1.956e+00 61.251 < 2e-16 ***
## PLUR2 -3.558e+02 2.943e+00 -120.897 < 2e-16 ***
## PLUR3 -4.603e+02 1.570e+01 -29.319 < 2e-16 ***
## PLUR4 -4.530e+02 9.589e+01 -4.724 2.31e-06 ***
## PLUR5 -2.735e+02 1.918e+02 -1.426 0.154
## PLUR6 -1.452e+02 1.752e+02 -0.829 0.407
## smoking_typebefore and during -2.041e+02 1.770e+00 -115.319 < 2e-16 ***
## smoking_typebefore only -1.152e+01 2.702e+00 -4.264 2.01e-05 ***
## smoking_typeduring only -1.651e+02 9.127e+00 -18.087 < 2e-16 ***
## MAGE 4.612e+00 9.638e-02 47.846 < 2e-16 ***
## MRACER1 8.372e+01 1.659e+00 50.458 < 2e-16 ***
## MRACER2 -9.902e+01 1.820e+00 -54.401 < 2e-16 ***
## MRACER3 -5.140e+00 4.574e+00 -1.124 0.261
## MRACER4 -3.341e+01 7.600e+00 -4.396 1.10e-05 ***
## MRACER5 -1.135e+02 1.794e+01 -6.330 2.46e-10 ***
## MRACER6 2.395e+01 4.402e+01 0.544 0.586
## MRACER7 -1.328e+01 9.456e+00 -1.404 0.160
## MRACER8 -1.020e+02 3.207e+00 -31.797 < 2e-16 ***
## mortality 2.391e+01 3.098e+00 7.717 1.19e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 428.8 on 717844 degrees of freedom
## Multiple R-squared: 0.5187, Adjusted R-squared: 0.5186
## F-statistic: 3.223e+04 on 24 and 717844 DF, p-value: < 2.2e-16
plot(model2$fitted.values, model2$residuals)
plot(births_excl$GEST, model2$residuals)
plot(births_excl$PLUR, model2$residuals)
plot(births_excl$PARITY_truncated, model2$residuals)
plot(births_excl$smoking_type, model2$residuals)
plot(births_excl$MRACER, model2$residuals)
plot(births_excl$MAGE, model2$residuals)
model3 = lm(data = births_excl, BWTG ~ GEST + GEST2 + GEST3 + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model3)
##
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + GEST3 + PARITY_truncated +
## PLUR + smoking_type + MAGE + MRACER + mortality, data = births_excl,
## na.action = "na.exclude")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3396.9 -280.5 -18.7 258.0 4875.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.702e+04 1.867e+02 91.151 < 2e-16 ***
## GEST -1.785e+03 1.766e+01 -101.111 < 2e-16 ***
## GEST2 6.015e+01 5.447e-01 110.436 < 2e-16 ***
## GEST3 -6.020e-01 5.500e-03 -109.463 < 2e-16 ***
## PARITY_truncated2 8.814e+01 1.308e+00 67.364 < 2e-16 ***
## PARITY_truncated3 1.055e+02 1.529e+00 68.985 < 2e-16 ***
## PARITY_truncated4 1.100e+02 1.914e+00 57.455 < 2e-16 ***
## PARITY_truncated5+ 1.118e+02 1.941e+00 57.628 < 2e-16 ***
## PLUR2 -3.303e+02 2.928e+00 -112.805 < 2e-16 ***
## PLUR3 -3.509e+02 1.560e+01 -22.488 < 2e-16 ***
## PLUR4 -2.910e+02 9.512e+01 -3.060 0.00222 **
## PLUR5 -1.592e+01 1.902e+02 -0.084 0.93330
## PLUR6 -5.172e+02 1.738e+02 -2.976 0.00292 **
## smoking_typebefore and during -2.033e+02 1.755e+00 -115.846 < 2e-16 ***
## smoking_typebefore only -1.131e+01 2.680e+00 -4.221 2.43e-05 ***
## smoking_typeduring only -1.651e+02 9.052e+00 -18.240 < 2e-16 ***
## MAGE 4.592e+00 9.559e-02 48.041 < 2e-16 ***
## MRACER1 8.254e+01 1.646e+00 50.158 < 2e-16 ***
## MRACER2 -9.966e+01 1.805e+00 -55.205 < 2e-16 ***
## MRACER3 -2.099e+00 4.537e+00 -0.463 0.64363
## MRACER4 -3.828e+01 7.537e+00 -5.079 3.80e-07 ***
## MRACER5 -1.184e+02 1.779e+01 -6.658 2.78e-11 ***
## MRACER6 2.413e+01 4.365e+01 0.553 0.58043
## MRACER7 -1.951e+01 9.378e+00 -2.080 0.03750 *
## MRACER8 -1.070e+02 3.181e+00 -33.620 < 2e-16 ***
## mortality 1.952e+01 3.073e+00 6.353 2.12e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 425.2 on 717843 degrees of freedom
## Multiple R-squared: 0.5266, Adjusted R-squared: 0.5265
## F-statistic: 3.194e+04 on 25 and 717843 DF, p-value: < 2.2e-16
plot(model3$fitted.values, model3$residuals)
plot(births_excl$GEST, model3$residuals)
plot(births_excl$PLUR, model3$residuals)
plot(births_excl$PARITY_truncated, model3$residuals)
plot(births_excl$smoking_type, model3$residuals)
plot(births_excl$MRACER, model3$residuals)
plot(births_excl$MAGE, model3$residuals)
births_excl = births_excl %>%
mutate(GEST4 = GEST^4)
model4 = lm(data = births_excl, BWTG ~ GEST + GEST2 + GEST3 + GEST4 + PARITY_truncated + PLUR_truncated + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model4)
##
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + GEST3 + GEST4 + PARITY_truncated +
## PLUR_truncated + smoking_type + MAGE + MRACER + mortality,
## data = births_excl, na.action = "na.exclude")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3398.1 -280.3 -19.4 257.3 4742.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.601e+04 7.646e+02 -20.934 < 2e-16 ***
## GEST 2.640e+03 1.009e+02 26.157 < 2e-16 ***
## GEST2 -1.565e+02 4.896e+00 -31.968 < 2e-16 ***
## GEST3 4.009e+00 1.037e-01 38.660 < 2e-16 ***
## GEST4 -3.610e-02 8.109e-04 -44.525 < 2e-16 ***
## PARITY_truncated2 8.561e+01 1.308e+00 65.458 < 2e-16 ***
## PARITY_truncated3 1.028e+02 1.528e+00 67.270 < 2e-16 ***
## PARITY_truncated4 1.076e+02 1.912e+00 56.276 < 2e-16 ***
## PARITY_truncated5+ 1.104e+02 1.938e+00 56.944 < 2e-16 ***
## PLUR_truncated2 -3.153e+02 2.943e+00 -107.137 < 2e-16 ***
## PLUR_truncated3+ -3.296e+02 1.528e+01 -21.565 < 2e-16 ***
## smoking_typebefore and during -2.024e+02 1.753e+00 -115.467 < 2e-16 ***
## smoking_typebefore only -1.114e+01 2.676e+00 -4.161 3.16e-05 ***
## smoking_typeduring only -1.647e+02 9.039e+00 -18.225 < 2e-16 ***
## MAGE 4.592e+00 9.546e-02 48.105 < 2e-16 ***
## MRACER1 8.213e+01 1.643e+00 49.981 < 2e-16 ***
## MRACER2 -9.986e+01 1.803e+00 -55.396 < 2e-16 ***
## MRACER3 -1.769e+00 4.531e+00 -0.390 0.6963
## MRACER4 -4.031e+01 7.527e+00 -5.355 8.57e-08 ***
## MRACER5 -1.203e+02 1.776e+01 -6.774 1.25e-11 ***
## MRACER6 2.233e+01 4.359e+01 0.512 0.6085
## MRACER7 -2.029e+01 9.365e+00 -2.167 0.0302 *
## MRACER8 -1.081e+02 3.177e+00 -34.040 < 2e-16 ***
## mortality 1.813e+01 3.069e+00 5.908 3.47e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 424.6 on 717845 degrees of freedom
## Multiple R-squared: 0.5279, Adjusted R-squared: 0.5278
## F-statistic: 3.489e+04 on 23 and 717845 DF, p-value: < 2.2e-16
plot(model4$fitted.values, model4$residuals)
plot(births_excl$GEST, model4$residuals)
plot(births_excl$PLUR_truncated, model4$residuals)
plot(births_excl$PARITY_truncated, model4$residuals)
plot(births_excl$smoking_type, model4$residuals)
plot(births_excl$MRACER, model4$residuals)
plot(births_excl$MAGE, model4$residuals)